16S-rDNA-V3-V4 repository: https://github.com/ycl6/16S-rDNA-V3-V4

phyloseq: GitHub, Documentation, Paper

GUniFrac: Paper

Demo Dataset: PRJEB27564 from Gut Microbiota in Parkinson’s Disease: Temporal Stability and Relations to Disease Progression. EBioMedicine. 2019;44:691-707

License: GPL-3.0


Workflow (Continues from Part 1)

Start R

cd /ngs/16S-Demo/PRJEB27564

R

Load package and set path

library("data.table")
library("phyloseq")
library("DESeq2")
library("ggplot2")
library("ggbeeswarm")
library("ggrepel")
library("vegan")
library("dplyr")

Set the full-path to additional scripts

source("/path-to-script/taxa_summary.R", local = TRUE)

Load saved workspace from Part 1

load("1_dada2_tutorial.RData")

Check phyloseq object

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7712 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 7712 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7712 tips and 7710 internal nodes ]

Filter Taxa

Subset ps using subset_taxa

Here, we limit the taxa to those from Bacteria, Phylum and Class in not unassigned, and not belonging to the Mitochondria Family

ps0 = subset_taxa(ps, Kingdom == "Bacteria" & !is.na(Phylum) & !is.na(Class) & 
          Family != "Mitochondria")

Create a total counts data.table

tdt = data.table(tax_table(ps0), TotalCounts = taxa_sums(ps0), SV = taxa_names(ps0))

tdt
##        Kingdom     Phylum      Class           Order          Family           Genus
##    1: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    2: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    3: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    4: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    5: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##   ---                                                                               
## 6536: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6537: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6538: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6539: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6540: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##       Species TotalCounts     SV
##    1:    <NA>        4311 SV0341
##    2:    <NA>         274 SV1878
##    3:    <NA>          36 SV4447
##    4:    <NA>         176 SV2333
##    5:    <NA>          18 SV5394
##   ---                           
## 6536:    <NA>         173 SV2351
## 6537:    <NA>          18 SV5395
## 6538:    <NA>           6 SV6574
## 6539:    <NA>       15873 SV0105
## 6540:    <NA>      456910 SV0001
ggplot(tdt, aes(TotalCounts)) + geom_histogram(bins = 50) + theme_bw() + 
    ggtitle("Histogram of Total Counts")

tdt[(TotalCounts == 1), .N]     # singletons
## [1] 6
tdt[(TotalCounts == 2), .N]     # doubletons
## [1] 228
# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]

# Plot the cumulative sum of ASVs against the total counts
pCumSum = ggplot(taxcumsum, aes(TotalCounts, CumSum)) + geom_point() + 
    theme_bw() + xlab("Filtering Threshold") + ylab("ASV Filtered")

png("images/FilterTaxa-taxa-abundance.png", width = 8, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(pCumSum, pCumSum + xlim(0, 500), 
            pCumSum + xlim(0, 100), pCumSum + xlim(0, 50), nrow = 2, 
            top = "ASVs that would be filtered vs. minimum taxa counts threshold")
invisible(dev.off())
"images/FilterTaxa-taxa-abundance.png"

“images/FilterTaxa-taxa-abundance.png”

Create a prevalence data.table

mdt = fast_melt(ps0)

mdt
##          Kingdom     Phylum      Class           Order          Family           Genus
##      1: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      2: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      3: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      4: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      5: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##     ---                                                                               
## 824036: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 824037: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 824038: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 824039: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 824040: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##         Species TaxaID SampleID count RelativeAbundance
##      1:    <NA> SV0001   C0005B  4573        0.08878405
##      2:    <NA> SV0001   C0009B   910        0.01732113
##      3:    <NA> SV0001   C0019B  2059        0.02722465
##      4:    <NA> SV0001   C0020B  1630        0.04075713
##      5:    <NA> SV0001   C0024B 25665        0.25073271
##     ---                                                
## 824036:    <NA> SV7712   P0120F     0        0.00000000
## 824037:    <NA> SV7712   P0045F     0        0.00000000
## 824038:    <NA> SV7712   P0045B     0        0.00000000
## 824039:    <NA> SV7712   P0083F     0        0.00000000
## 824040:    <NA> SV7712   P0083B     0        0.00000000
prevdt = mdt[, list(Prevalence = sum(count > 0), 
            TotalCounts = sum(count), 
            MaxCounts = max(count)), by = TaxaID]

prevdt
##       TaxaID Prevalence TotalCounts MaxCounts
##    1: SV0001        125      456910     25665
##    2: SV0002         95      325977     20183
##    3: SV0003        109      310061     18097
##    4: SV0004        117      295553     17406
##    5: SV0005        111      283113     19696
##   ---                                        
## 6536: SV7708          1           1         1
## 6537: SV7709          1           1         1
## 6538: SV7710          1           1         1
## 6539: SV7711          1           1         1
## 6540: SV7712          1           1         1
ggplot(prevdt, aes(Prevalence)) + geom_histogram(bins = 50) + theme_bw() +
    ggtitle("Histogram of Taxa Prevalence")

prevdt[(Prevalence == 1), .N]   # singletons
## [1] 2988
prevdt[(Prevalence == 2), .N]   # doubletons
## [1] 1183
ggplot(prevdt, aes(MaxCounts)) + geom_histogram(bins = 100) + xlim(0, 500) +
    theme_bw() + ggtitle("Histogram of Maximum TotalCounts")

table(prevdt$MaxCounts)[1:50]
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22 
##   6 240 230 196 181 165 172 125 135 114 107 105  96  85  85  88  73  84  58  58  74  64 
##  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44 
##  48  48  42  56  47  57  48  41  49  45  51  38  46  41  43  38  40  42  29  33  36  35 
##  45  46  47  48  49  50 
##  27  39  29  34  29  29
prevdt[(MaxCounts == 1)]    # singletons
##    TaxaID Prevalence TotalCounts MaxCounts
## 1: SV7707          1           1         1
## 2: SV7708          1           1         1
## 3: SV7709          1           1         1
## 4: SV7710          1           1         1
## 5: SV7711          1           1         1
## 6: SV7712          1           1         1
# taxa cumulative sum
prevcumsum = prevdt[, .N, by = Prevalence]
setkey(prevcumsum, Prevalence)
prevcumsum[, CumSum := cumsum(N)]

# Plot the cumulative sum of ASVs against the prevalence
pPrevCumSum = ggplot(prevcumsum, aes(Prevalence, CumSum)) + geom_point(size = 2, alpha = 0.5) + 
    theme_bw() + xlab("Filtering Threshold") + ylab("ASVs Filtered") + 
    ggtitle("ASVs that would be filtered vs. minimum sample count threshold")

png("images/FilterTaxa-taxa-prevalence.png", width = 8, height = 8, units = "in", res = 300)
pPrevCumSum
invisible(dev.off())
"images/FilterTaxa-taxa-prevalence.png"

“images/FilterTaxa-taxa-prevalence.png”

Prevalence vs. Total Count Scatter plot

png("images/FilterTaxa-Prevalence-TotalCounts.png", width = 8, height = 8, units = "in", res = 300)
ggplot(prevdt, aes(Prevalence, TotalCounts)) + geom_point(size = 2, alpha = 0.5) + 
    scale_y_log10() + theme_bw() + xlab("Prevalence [No. Samples]") + ylab("TotalCounts [Taxa]")
invisible(dev.off())
"images/FilterTaxa-Prevalence-TotalCounts.png"

“images/FilterTaxa-Prevalence-TotalCounts.png”

Colored by phylum

addPhylum = unique(copy(mdt[, list(TaxaID, Phylum)]))

# Join by TaxaID
setkey(prevdt, TaxaID)
setkey(addPhylum, TaxaID)
prevdt = addPhylum[prevdt]
setkey(prevdt, Phylum)

png("images/FilterTaxa-Prevalence-TotalCounts-Phylum.png", width = 8, height = 8, 
    units = "in", res = 300)
ggplot(prevdt, aes(Prevalence, TotalCounts, color = Phylum)) + 
    geom_point(size = 1, alpha = 0.5) + scale_y_log10() + theme_bw() + 
    facet_wrap(~Phylum, nrow = 4) + theme(legend.position="none") + 
    xlab("Prevalence [No. Samples]") + ylab("Total Abundance")
invisible(dev.off())
"images/FilterTaxa-Prevalence-TotalCounts-Phylum.png"

“images/FilterTaxa-Prevalence-TotalCounts-Phylum.png”

Define filter threshold

Remember to review the images/FilterTaxa-*.png plots to select the best paramters

prevalenceThreshold = 5
abundanceThreshold = 10
maxThreshold = 5

Perform taxa filtering

keepTaxa = prevdt[(Prevalence > prevalenceThreshold & 
           TotalCounts > abundanceThreshold & 
           MaxCounts > maxThreshold), TaxaID]

ps1 = prune_taxa(keepTaxa, ps0)

phy_tree(ps1) = ape::root(phy_tree(ps1), sample(taxa_names(ps1), 1), resolve.root = TRUE)

ps1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1425 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 1425 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1425 tips and 1424 internal nodes ]

Create output files

Create FASTA file

dada2::uniquesToFasta(df[rownames(df) %in% taxa_names(ps1),], "outfiles/expr.asv.fasta", 
              ids = df[rownames(df) %in% taxa_names(ps1),]$id)

Create BIOM file

biomformat::write_biom(biomformat::make_biom(data = t(as.matrix(otu_table(ps1)))), 
               "outfiles/expr.biom")

Create ASV and Taxonomy tables

write.table(as.data.table(otu_table(ps1), keep.rownames = T), file = "outfiles/expr.otu_table.txt", 
        sep = "\t", quote = F, row.names = F, col.names = T)
write.table(as.data.table(tax_table(ps1), keep.rownames = T), file = "outfiles/expr.tax_table.txt", 
        sep = "\t", quote = F, row.names = F, col.names = T)

Create raw abundance tables

write.table(reshape2::dcast(ps1 %>% psmelt() %>% arrange(OTU) %>% rename(ASV = OTU), 
        ASV+Kingdom+Phylum+Class+Order+Family+Genus+Species~Sample_ID, value.var = "Abundance"), 
        file = "outfiles/expr.abundance.all.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Phylum") %>% psmelt(), 
        Phylum~Sample_ID, value.var = "Abundance"), 
        file = "outfiles/expr.abundance.abphy.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Class") %>% psmelt(), 
        Class~Sample_ID, value.var = "Abundance"), 
        file = "outfiles/expr.abundance.abcls.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Family") %>% psmelt(), 
        Family~Sample_ID, value.var = "Abundance"), 
        file = "outfiles/expr.abundance.abfam.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Genus") %>% psmelt(), 
        Genus~Sample_ID, value.var = "Abundance"), 
        file = "outfiles/expr.abundance.abgen.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)

Create relative abundance tables

Use the transform_sample_counts function to transforms the sample counts of a taxa abundance matrix according to the provided function, in the case the fractions of the whole sum

write.table(reshape2::dcast(ps1 %>% transform_sample_counts(function(x) {x/sum(x)} ) %>% 
    psmelt() %>% arrange(OTU) %>% rename(ASV = OTU), 
    ASV+Kingdom+Phylum+Class+Order+Family+Genus+Species~Sample_ID, value.var = "Abundance"), 
    file = "outfiles/expr.relative_abundance.all.txt", sep = "\t", 
    quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Phylum") %>% 
    transform_sample_counts(function(x) {x/sum(x)} ) %>% 
    psmelt(), Phylum~Sample_ID, value.var = "Abundance"), 
    file = "outfiles/expr.relative_abundance.abphy.txt", sep = "\t", 
    quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Class") %>% 
    transform_sample_counts(function(x) {x/sum(x)} ) %>% 
    psmelt(), Class~Sample_ID, value.var = "Abundance"), 
    file = "outfiles/expr.relative_abundance.abcls.txt", sep = "\t", 
    quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Family") %>% 
    transform_sample_counts(function(x) {x/sum(x)} ) %>% 
    psmelt(), Family~Sample_ID, value.var = "Abundance"), 
    file = "outfiles/expr.relative_abundance.abfam.txt", sep = "\t", 
    quote = F, row.names = F, col.names = T)
write.table(reshape2::dcast(ps1 %>% tax_glom(taxrank = "Genus") %>% 
    transform_sample_counts(function(x) {x/sum(x)} ) %>% 
    psmelt(), Genus~Sample_ID, value.var = "Abundance"), 
    file = "outfiles/expr.relative_abundance.abgen.txt", sep = "\t", 
    quote = F, row.names = F, col.names = T)

Plot abundance

# Transform to proportions (relative abundances)
ps1.rp = transform_sample_counts(ps1, function(OTU) OTU/sum(OTU))

# Top N taxa
N = 200
topN = names(sort(taxa_sums(ps1), decreasing=TRUE))[1:N]
ps1.topN = transform_sample_counts(ps1, function(OTU) OTU/sum(OTU))
ps1.topN = prune_taxa(topN, ps1.topN)

ps1.topN
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 200 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 200 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 200 tips and 199 internal nodes ]

At Phylum level

# All taxa
ptaxa1 = plot_bar(ps1.rp, x = "Sample_ID", fill = "Phylum", 
          title = paste(ntaxa(ps1.rp), "Taxa (Phylum)")) + 
    geom_bar(stat = "identity", size = 0.1, color = "black") + 
    facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) + 
    scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
    theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7), 
          axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative abundance")

# Top 200 taxa
ptaxa2 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Phylum", 
          title = paste("Top",N, "Taxa (Phylum)")) + 
    geom_bar(stat = "identity", size = 0.1, color = "black") + 
    facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) + 
    scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
    theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7), 
          axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_phylum.png", width = 10, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(ptaxa1, ptaxa2, nrow = 2)
invisible(dev.off())
"images/plot_bar_phylum.png"

“images/plot_bar_phylum.png”

At Class level

ptaxa3 = plot_bar(ps1.rp, x = "Sample_ID", fill = "Class", 
          title = paste(ntaxa(ps1.rp), "Taxa (Class)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) +
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative abundance")

ptaxa4 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Class", 
          title = paste("Top",N, "Taxa (Class)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) +
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_class.png", width = 10, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(ptaxa3, ptaxa4, nrow = 2)
invisible(dev.off())
"images/plot_bar_class.png"

“images/plot_bar_class.png”

At Family level

ptaxa5 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Family", 
          title = paste("Top",N, "Taxa (Family)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_family.png", width = 10, height = 8, units = "in", res = 300)
ptaxa5
invisible(dev.off())
"images/plot_bar_family.png"

“images/plot_bar_family.png”

At Genus level

ptaxa6 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Genus", 
          title = paste("Top",N, "Taxa (Genus)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 2)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_genus.png", width = 10, height = 8, units = "in", res = 300)
ptaxa6
invisible(dev.off())
"images/plot_bar_genus.png"

“images/plot_bar_genus.png”

Alpha diversity

Explore alpha diversity using the plot_richness function

# Select alpha-diversity measures
betaM = c("Observed", "Chao1", "Shannon", "Simpson")

png("images/plot_richness.png", width = 6, height = 5, units = "in", res = 300)
plot_richness(ps1, x = "Group2", measures = betaM, color = "Group2", nrow = 1) + 
    geom_point(size = 0.8) + theme_bw() + 
    theme(legend.position = "none", 
          axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 10)) + 
    labs(x = "Group", y = "Alpha Diversity Measure")
invisible(dev.off())
"images/plot_richness.png"

“images/plot_richness.png”

Create a long-format data.frame and plot with ggplot function

ad = estimate_richness(ps1, measures = betaM)
ad = merge(data.frame(sample_data(ps1)), ad, by = "row.names")
ad = reshape2::melt(ad[,c("Sample_ID", "Group2", betaM)], id = c("Sample_ID", "Group2"))

png("images/plot_richness_boxplot.png", width = 6, height = 5, units = "in", res = 300)
ggplot(ad, aes(Group2, value, color = Group2)) + 
    geom_boxplot(outlier.shape = NA, size = 0.8, width = 0.8) + 
    geom_quasirandom(size = 0.8, color = "black") + theme_bw() + 
    facet_wrap(~ variable, scales = "free_y", nrow = 1) + 
    theme(legend.position = "none", 
          axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 10)) + 
    labs(x = "Group", y = "Alpha Diversity Measure")
invisible(dev.off())
"images/plot_richness_boxplot.png"

“images/plot_richness_boxplot.png”

Beta diversity

Perform various flavors of unifrac measurements using the GUniFrac package

unifracs = GUniFrac::GUniFrac(otu_table(ps1), phy_tree(ps1), alpha = c(0, 0.5, 1))$unifracs

Perform ordination

# Calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm = TRUE){
    exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Perform variance stabilizing transformation
ps1.vsd = ps1
dds = phyloseq_to_deseq2(ps1, ~ Group2)
## converting counts to integer mode
geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)
dds = estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
abund.vsd = getVarianceStabilizedData(dds)
abund.vsd[abund.vsd < 0] = 0    # set negative values to 0
otu_table(ps1.vsd) = otu_table(abund.vsd, taxa_are_rows = TRUE)

# Create distance objects
dist_un = as.dist(unifracs[, , "d_UW"])       # Unweighted UniFrac
attr(dist_un, "method") = "Unweighted UniFrac"

dist_wu = as.dist(unifracs[, , "d_1"])        # Weighted UniFrac
attr(dist_wu, "method") = "Weighted UniFrac"

dist_vu = as.dist(unifracs[, , "d_VAW"])      # Variance-adjusted-weighted UniFrac
attr(dist_vu, "method") = "Variance-adjusted-weighted UniFrac"

dist_gu = as.dist(unifracs[, , "d_0.5"])      # GUniFrac with alpha 0.5
attr(dist_gu, "method") = "GUniFrac with alpha 0.5"

dist_bc = phyloseq::distance(ps1.vsd, "bray") # Bray-Curtis

# Perform ordination
ord_un = ordinate(ps1, method = "PCoA", distance = dist_un)
ord_wu = ordinate(ps1, method = "PCoA", distance = dist_wu)
ord_vu = ordinate(ps1, method = "PCoA", distance = dist_vu)
ord_gu = ordinate(ps1, method = "PCoA", distance = dist_gu)
set.seed(12345)
ord_bc = ordinate(ps1, method = "NMDS", distance = dist_bc)
## Run 0 stress 0.1643532 
## Run 1 stress 0.1643515 
## ... New best solution
## ... Procrustes: rmse 0.0004348073  max resid 0.003627076 
## ... Similar to previous best
## Run 2 stress 0.4123846 
## Run 3 stress 0.1647262 
## ... Procrustes: rmse 0.007101905  max resid 0.07513243 
## Run 4 stress 0.1647288 
## ... Procrustes: rmse 0.006996405  max resid 0.0753685 
## Run 5 stress 0.1660952 
## Run 6 stress 0.1657001 
## Run 7 stress 0.1643496 
## ... New best solution
## ... Procrustes: rmse 0.0007611539  max resid 0.007539293 
## ... Similar to previous best
## Run 8 stress 0.1793736 
## Run 9 stress 0.1656961 
## Run 10 stress 0.1643504 
## ... Procrustes: rmse 0.0008262269  max resid 0.006127567 
## ... Similar to previous best
## Run 11 stress 0.1657059 
## Run 12 stress 0.1647458 
## ... Procrustes: rmse 0.005757801  max resid 0.06097133 
## Run 13 stress 0.1647282 
## ... Procrustes: rmse 0.007056738  max resid 0.07573969 
## Run 14 stress 0.1647293 
## ... Procrustes: rmse 0.007042765  max resid 0.07538188 
## Run 15 stress 0.189651 
## Run 16 stress 0.1657084 
## Run 17 stress 0.1661109 
## Run 18 stress 0.1788554 
## Run 19 stress 0.1647292 
## ... Procrustes: rmse 0.006985754  max resid 0.07564235 
## Run 20 stress 0.1657013 
## *** Solution reached

Output to PDF

pdf("images/plot_ordination.pdf", width = 6, height = 5, pointsize = 12)
plot_ordination(ps1, ord_un, type = "samples", color = "Group2", 
        title = "PCoA on unweighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_wu, type = "samples", color = "Group2", 
        title = "PCoA on weighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_vu, type = "samples", color = "Group2", 
        title = "PCoA on Variance adjusted weighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_gu, type = "samples", color = "Group2", 
        title = "PCoA on GUniFrac with alpha 0.5") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_bc, type = "samples", color = "Group2", 
        title = "NMDS on Bray-Curtis distance (VST)") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")
invisible(dev.off())

Example: PCoA on weighted UniFrac

Multivariate analyses

PERMANOVA

Permutational multivariate analysis of variance (PERMANOVA) test for differences between independent groups using the adonis function

adonis(dist_un ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_un ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)
## Group2      3    0.3542 0.11805   1.163 0.0278 0.1339
## Residuals 122   12.3842 0.10151         0.9722       
## Total     125   12.7383                 1.0000
adonis(dist_wu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_wu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.1154 0.038455  1.2639 0.03014 0.1808
## Residuals 122    3.7118 0.030425         0.96986       
## Total     125    3.8272                  1.00000
adonis(dist_vu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_vu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## Group2      3   0.08185 0.027284  1.1939 0.02852 0.2468
## Residuals 122   2.78803 0.022853         0.97148       
## Total     125   2.86989                  1.00000
adonis(dist_gu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_gu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.3387 0.11291  1.1061 0.02648 0.2228
## Residuals 122   12.4546 0.10209         0.97352       
## Total     125   12.7933                 1.00000
adonis(dist_bc ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_bc ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.6685 0.22283  1.1591 0.02771 0.1179
## Residuals 122   23.4537 0.19224         0.97229       
## Total     125   24.1222                 1.00000

Multivariate dispersion

Test for homogeneity condition among groups using the betadisper function

disp1 = betadisper(dist_un, group = data.frame(sample_data(ps1))$Group2)
permutest(disp1, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.002917 0.00097247 0.5971   1000 0.6284
## Residuals 122 0.198686 0.00162857
disp2 = betadisper(dist_wu, group = data.frame(sample_data(ps1))$Group2) 
permutest(disp2, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.006617 0.0022055 1.0552   1000 0.3536
## Residuals 122 0.254995 0.0020901
disp3 = betadisper(dist_vu, group = data.frame(sample_data(ps1))$Group2)
permutest(disp3, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df  Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.00155 0.00051714 0.1818   1000 0.9151
## Residuals 122 0.34708 0.00284493
disp4 = betadisper(dist_gu, group = data.frame(sample_data(ps1))$Group2)
permutest(disp4, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.004338 0.0014459 0.5911   1000 0.6014
## Residuals 122 0.298427 0.0024461
disp5 = betadisper(dist_bc, group = data.frame(sample_data(ps1))$Group2)
permutest(disp5, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.01867 0.006223 1.7644   1000 0.1638
## Residuals 122 0.43029 0.003527

Example: Dispersion of unweighted UniFrac

plot(disp3, hull = FALSE, ellipse = TRUE)

Save current workspace

save.image(file = "2_phyloseq_tutorial.RData")

Session information

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/p27/lib/libopenblasp-r0.3.7.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] dplyr_1.0.0                 vegan_2.5-6                 lattice_0.20-41            
##  [4] permute_0.9-5               ggrepel_0.8.2               ggbeeswarm_0.6.0           
##  [7] ggplot2_3.3.2               DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [10] DelayedArray_0.12.3         BiocParallel_1.20.1         matrixStats_0.56.0         
## [13] Biobase_2.46.0              GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [16] IRanges_2.20.2              S4Vectors_0.24.4            BiocGenerics_0.32.0        
## [19] phyloseq_1.30.0             data.table_1.12.8           knitr_1.29                 
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1         hwriter_1.3.2            ellipsis_0.3.1          
##  [4] htmlTable_2.0.1          XVector_0.26.0           base64enc_0.1-3         
##  [7] rstudioapi_0.11          farver_2.0.3             bit64_0.9-7             
## [10] AnnotationDbi_1.48.0     codetools_0.2-16         splines_3.6.2           
## [13] geneplotter_1.64.0       dada2_1.14.1             ade4_1.7-15             
## [16] Formula_1.2-3            jsonlite_1.7.0           Rsamtools_2.2.3         
## [19] annotate_1.64.0          cluster_2.1.0            png_0.1-7               
## [22] compiler_3.6.2           backports_1.1.8          Matrix_1.2-18           
## [25] acepack_1.4.1            htmltools_0.5.0          tools_3.6.2             
## [28] igraph_1.2.5             gtable_0.3.0             glue_1.4.1              
## [31] GenomeInfoDbData_1.2.2   reshape2_1.4.4           ShortRead_1.44.3        
## [34] Rcpp_1.0.5               vctrs_0.3.1              Biostrings_2.54.0       
## [37] multtest_2.42.0          ape_5.4                  nlme_3.1-148            
## [40] iterators_1.0.12         xfun_0.15                stringr_1.4.0           
## [43] lifecycle_0.2.0          XML_3.98-1.20            zlibbioc_1.32.0         
## [46] MASS_7.3-51.6            scales_1.1.1             biomformat_1.14.0       
## [49] rhdf5_2.30.1             RColorBrewer_1.1-2       yaml_2.2.1              
## [52] memoise_1.1.0            gridExtra_2.3            rpart_4.1-15            
## [55] latticeExtra_0.6-29      stringi_1.4.6            RSQLite_2.2.0           
## [58] highr_0.8                genefilter_1.68.0        foreach_1.5.0           
## [61] checkmate_2.0.0          rlang_0.4.6              pkgconfig_2.0.3         
## [64] bitops_1.0-6             evaluate_0.14            purrr_0.3.4             
## [67] Rhdf5lib_1.8.0           GenomicAlignments_1.22.1 htmlwidgets_1.5.1       
## [70] labeling_0.3             bit_1.1-15.2             tidyselect_1.1.0        
## [73] plyr_1.8.6               magrittr_1.5             R6_2.4.1                
## [76] generics_0.0.2           Hmisc_4.4-0              DBI_1.1.0               
## [79] pillar_1.4.5             foreign_0.8-73           withr_2.2.0             
## [82] mgcv_1.8-31              survival_3.2-3           RCurl_1.98-1.2          
## [85] nnet_7.3-14              tibble_3.0.2             crayon_1.3.4            
## [88] rmarkdown_2.3            jpeg_0.1-8.1             locfit_1.5-9.4          
## [91] grid_3.6.2               blob_1.2.1               digest_0.6.25           
## [94] xtable_1.8-4             RcppParallel_5.0.2       munsell_0.5.0           
## [97] beeswarm_0.2.3           vipor_0.4.5